from IPython.display import Image
Image(filename='./images/3-0-0-0_opening.jpg')
In this lesson, you will learn about methods such as hill climbing, simulated annealing, and adaptive noise scaling. You'll also learn about cross-entropy methods and evolution strategies.
In this lesson, you'll study REINFORCE, along with improvements we can make to lower the variance of policy gradient algorithms.
In this lesson, you'll learn about Proximal Policy Optimization (PPO), a cutting-edge policy gradient method.
In this lesson, you'll learn how to combine value-based and policy-based methods, bringing together the best of both worlds, to solve challenging reinforcement learning problems.
In this optional lesson, you'll learn how to apply deep reinforcement learning techniques for optimal execution of portfolio transactions.
from IPython.display import Image
Image(filename='./images/3-1-1-1_value_based_methods_with_discrete_state.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-1-2_value_based_methods_with_continuous_state.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-1-3_value_based_methods_with_continuous_state_using_Deep_Q_Learning.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-1-4_value_based_methods_estimate_optimal_value_function_first_before_optimal_policy.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-1-5_policy_based_methods.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-2-1_policy_function_approximation_cartpole_has_two_actions.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-2-2_policy_function_approximation_neural_newtwork_return_posibilities.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-2-3_policy_function_approximation_agent_learns_how_to_maximize_reword_interactively.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-3-1_neural_network_encodes_action_probabilities.png')
Reference: [Evolution Strategies (OpenAI blog)](https://blog.openai.com/evolution-strategies/)
from IPython.display import Image
Image(filename='./images/3-1-3-2_continuous_action_space_in_bipedal_walker.png')
from IPython.display import Image
Image(filename='./images/3-1-3-3_continuous_action_space_in_mountain_car_continuous.png')
[Hill climbing](https://en.wikipedia.org/wiki/Hill_climbing) is not just for reinforcement learning! It is a general optimization method that is used to find the maximum of a function.
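As a minimal, self-contained illustration (the objective function and constants here are hypothetical, not from the lesson), here is hill climbing maximizing a simple one-parameter function:

import numpy as np

def hill_climb(f, theta=0.0, noise=0.1, steps=500, seed=0):
    # Repeatedly perturb the best parameter found so far, keeping a
    # candidate only when it improves the objective.
    rng = np.random.default_rng(seed)
    best_theta, best_value = theta, f(theta)
    for _ in range(steps):
        candidate = best_theta + noise * rng.standard_normal()
        if f(candidate) > best_value:
            best_theta, best_value = candidate, f(candidate)
    return best_theta, best_value

# Maximize f(x) = -(x - 2)**2, whose maximum is at x = 2.
print(hill_climb(lambda x: -(x - 2) ** 2))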
from IPython.display import Image
Image(filename='./images/3-1-4-1_hill_climbiing_neural_network_input_and_output.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-4-2_hill_climbiing_neural_relation_between_j_theta.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-4-3_hill_climbiing_neural_gradient_ascent.jpeg')
Gradient ascent is similar to gradient descent (see the short example after this list):
* Gradient descent steps in the direction opposite the gradient, since it wants to minimize a function.
* Gradient ascent is otherwise identical, except we step in the direction of the gradient, to reach the maximum.
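A tiny hypothetical example makes the symmetry concrete: gradient ascent on f(x) = -(x - 2)² just flips the sign of the update step.

# Gradient ascent on f(x) = -(x - 2)**2, whose derivative is f'(x) = -2 * (x - 2).
x, lr = 0.0, 0.1
for _ in range(100):
    grad = -2 * (x - 2)  # gradient of f at the current x
    x += lr * grad       # ascent steps WITH the gradient (descent would subtract)
print(x)                 # converges toward 2, the maximizer of f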
from IPython.display import Image
Image(filename='./images/3-1-4-4_hill_climbiing_reach_maximum_value_of_function.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-4-5_hill_climbiing_reach_optimal_value.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-5-1_hill_climbiing_pseudocode.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-1_hill_climbiing_we_dont_know_j.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-2_hill_climbiing_with_sochastic_policy_search_returns_objective_value.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-3_hill_climbiing_policy_is_somewhere_on_the_objective_function_surface.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-4_hill_climbiing_change_parameters_by_adding_gaussian_noise.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-5_hill_climbiing_set_this_policy_to_new_best_policy.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-6_hill_climbiing_iterate_until_top_of_the_hill.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-7_beyond_hill_climbiing_steepest_ascent.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-8_beyond_hill_climbiing_simulated_annealing.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-9_beyond_hill_climbiing_adaptive_noise_same_as_sa.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-6-10_beyond_hill_climbiing_adaptive_noise_extend_serach_radius_when_is_not_best.jpeg')
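The last few slides describe simulated annealing (reduce the noise according to a fixed schedule) and adaptive noise scaling (shrink the search radius after an improvement, and expand it when the candidate is not better). A minimal sketch of adaptive noise scaling, with hypothetical constants:

import numpy as np

def adaptive_noise_hill_climb(f, theta=0.0, steps=200, seed=0):
    rng = np.random.default_rng(seed)
    noise = 1.0                                # current search radius
    best_theta, best_return = theta, f(theta)
    for _ in range(steps):
        candidate = best_theta + noise * rng.standard_normal()
        if f(candidate) > best_return:
            best_theta, best_return = candidate, f(candidate)
            noise = max(noise / 2, 1e-3)       # improvement: contract the search radius
        else:
            noise = min(noise * 2, 2.0)        # no improvement: expand the search radius
    return best_theta

print(adaptive_noise_hill_climb(lambda x: -(x - 2) ** 2))  # approaches 2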
from IPython.display import Image
Image(filename='./images/3-1-7-1_steepest_ascent_dont_use_usuful_imformation_from_not_selected.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-7-2_cross_entropy_method_select_top_n_and_use_average_of_them.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-7-3_evolution_strategies_the_best_policy_is_weighted_sum_of_all_selected.jpeg')
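The three slides above contrast steepest-ascent hill climbing (keep only the single best candidate), the cross-entropy method (average the top-n candidates), and evolution strategies (a return-weighted average of all candidates). A minimal sketch of the cross-entropy-method update on a toy objective (population size and elite count are illustrative choices):

import numpy as np

rng = np.random.default_rng(0)
f = lambda x: -(x - 2) ** 2                  # toy objective to maximize
theta, sigma, pop, n_elite = 0.0, 0.5, 50, 10

for _ in range(50):
    candidates = theta + sigma * rng.standard_normal(pop)  # perturb the current estimate
    returns = f(candidates)
    elite = candidates[np.argsort(returns)[-n_elite:]]     # top-n candidates by return
    theta = elite.mean()                                   # cross-entropy method: average the elite
    # Evolution strategies would instead take a return-weighted average of ALL candidates.
print(theta)  # approaches 2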
Why do we need policy-based methods at all, when value-based methods work so well?
from IPython.display import Image
Image(filename='./images/3-1-8-1_policy_based_method_why.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-2_policy_based_method_simplicity.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-3_policy_based_method_policy_look_like.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-4_policy_based_method_stochastic_plicies.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-5_value_based_method_e-greedy_is_a_hack.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-6_policy_based_method_aliased_states.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-7_policy_based_method_aliased_states_if_using_value_function.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-8_policy_based_method_aliased_states_if_using_value_function_then_keep_oscillating_never_get_out.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-9_policy_based_method_aliased_states_if_using_policy_based_method_then_learn_desired_stochastic_policy.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-10_policy_based_method_discret_action_space.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-11_policy_based_method_continuous_action_space.jpeg')
from IPython.display import Image
Image(filename='./images/3-1-8-12_policy_based_method_high-dimensional_continuous_action_space.jpeg')
Policy gradient methods are a subclass of policy-based methods.
from IPython.display import Image
Image(filename='./images/3-2-1-1_policy_gradient_methods_is_subset_of_policy_based_methods.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-1-2_policy_gradient_methods_chicken_cross_the_road.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-1-3_policy_gradient_methods_case_of_four_possible_actions.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-1-4_policy_gradient_methods_possibly_cnn_is_best.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-1-5_policy_gradient_methods_reward_only_delivered_at_the_end_of_game.jpeg')
Before digging into the details of policy gradient methods, we'll discuss how they work at a high level.
from IPython.display import Image
Image(filename='./images/3-2-2-1_pg_big_picture_case_of_win.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-2_pg_big_picture_case_of_win_getting_action_posibilities_from_1st_timestamp.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-3_pg_big_picture_case_of_win_change_parameters_a_litte_bit_to_direction_winning_game.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-4_pg_big_picture_case_of_win_getting_action_posibilities_from_2nd_timestamp.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-5_pg_big_picture_case_of_win_change_parameters_a_litte_bit_to_up_direction_winning_game.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-6_pg_big_picture_case_of_lost.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-7_pg_big_picture_case_of_lost_getting_action_posibilities_from_1st_timestamp.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-8_pg_big_picture_case_of_lost_change_parameters_a_litte_bit_to_direction_winning_game.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-2-9_pg_big_picture_pseudocode.jpeg')
Policy gradient methods are very similar to supervised learning.
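One way to make the analogy concrete: the policy gradient loss looks like a cross-entropy loss in which each "label" is the action the agent actually took, weighted by the outcome of the game. A minimal sketch, assuming PyTorch is available (all tensors here are hypothetical stand-ins):

import torch
import torch.nn.functional as F

logits = torch.randn(3, 2, requires_grad=True)  # stand-in for policy-network outputs at 3 time steps
actions = torch.tensor([0, 1, 0])               # the actions that were actually taken
returns = torch.tensor([1.0, 1.0, 1.0])         # +1 everywhere: the game was won

log_probs = F.log_softmax(logits, dim=1)[torch.arange(3), actions]
loss = -(returns * log_probs).sum()             # return-weighted cross-entropy
loss.backward()                                 # pushes up the probability of the taken actions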
from IPython.display import Image
Image(filename='./images/3-2-3-1_pg_is_similar_to_supervised_learning.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-3-2_pg_is_similar_to_sl_mutiple_conflicting_opinions.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-3-3_more_learn_connection_between_sl_with_rl.png')
Let's define precisely how policy gradient methods work.
from IPython.display import Image
Image(filename='./images/3-2-4-1_how_pg_work_trajectory_is_state_action_sequence.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-4-2_how_pg_work_trajectory_does_not_keep_track_rewards.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-4-3_how_pg_work_deep_dive.jpeg')
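To restate the definition from the slides (using H for the horizon of the trajectory): a trajectory τ is a state-action sequence, with no rewards recorded in it, and its return R(τ) is the sum of the rewards collected along the way:

$$\tau = (s_0, a_0, s_1, a_1, \ldots, s_H, a_H, s_{H+1}), \qquad R(\tau) = r_1 + r_2 + \cdots + r_{H+1}.$$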
from IPython.display import Image
Image(filename='./images/3-2-5-1_reinforce_check_our_goal_first.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-2_reinforce_one_way_to_achieve_this_goal_is_gradient_ascent.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-3_reinforce_ga_vs_gd.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-4_reinforce_calculating_gradient_is_very_expensive.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-5_reinforce_estimate_gradient_and_consider_a_few_trajectories.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-6_reinforce_what_if_m_equals_one_trajectory.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-7_reinforce_detailed_meaning_when_one_trajectory.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-8_reinforce_detailed_meaning_direction_of_steepest_increase_ot_the_probability.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-9_reinforce_detailed_meaning_when_multiple_trajectories.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-5-10_reinforce_pseudocode.jpeg')
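A minimal sketch of the pseudocode above, assuming PyTorch and the classic OpenAI Gym API (env.reset() returns a state; env.step() returns a 4-tuple). CartPole-v0, the network sizes, and the learning rate are illustrative choices, not prescribed by the lesson:

import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

env = gym.make('CartPole-v0')                     # 4-dim state, 2 actions
policy = nn.Sequential(nn.Linear(4, 16), nn.ReLU(),
                       nn.Linear(16, 2), nn.Softmax(dim=-1))
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

for episode in range(500):
    log_probs, rewards = [], []
    state, done = env.reset(), False
    while not done:                               # collect one trajectory with the current policy
        dist = Categorical(policy(torch.as_tensor(state, dtype=torch.float32)))
        action = dist.sample()
        log_probs.append(dist.log_prob(action))
        state, reward, done, _ = env.step(action.item())
        rewards.append(reward)
    # REINFORCE update: step in the direction of R(tau) * sum_t grad log pi(a_t|s_t)
    loss = -sum(rewards) * torch.stack(log_probs).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()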
Next, we derive the equation that we use to approximate the gradient.
from IPython.display import Image
Image(filename='./images/3-2-6-1_reinforce_derivation_of_equation_that_aproximate_the_gradient.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-6-2_reinforce_derivation_of_equation_likelihood_ratio_policy_gradient.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-6-3_reinforce_equation_likelihood_ratio_policy_gradient.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-6-4_reinforce_derivation_of_equation_sample_based_estimate.jpeg')
from IPython.display import Image
Image(filename='./images/3-2-6-5_reinforce_derivation_of_equation_simplify.jpeg')
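In compact form, the derivation in the slides goes through the likelihood-ratio trick, $\nabla_\theta P(\tau;\theta) = P(\tau;\theta)\, \nabla_\theta \log P(\tau;\theta)$ (writing $U(\theta) = \sum_\tau P(\tau;\theta) R(\tau)$ for the expected return):

$$\nabla_\theta U(\theta) = \sum_\tau \nabla_\theta P(\tau;\theta)\, R(\tau) = \sum_\tau P(\tau;\theta)\, \nabla_\theta \log P(\tau;\theta)\, R(\tau),$$

and because the transition dynamics inside $P(\tau;\theta)$ do not depend on $\theta$, the log-probability of a trajectory differentiates to a sum over the policy terms only:

$$\nabla_\theta \log P(\tau;\theta) = \sum_{t=0}^{H} \nabla_\theta \log \pi_\theta(a_t|s_t).$$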
from IPython.display import Image
Image(filename='./images/3-2-6-6_reinforce_probility_density_function_correspoding_to_normal_distribution.png')
One of these key improvements is called Proximal Policy Optimization (PPO), which is closely related to Trust Region Policy Optimization (TRPO). It has allowed for faster and more stable learning. From developing agile robots to creating expert-level game-playing AI, PPO has proven useful in a wide range of applications and has become part of the standard toolkit for challenging learning environments.
from IPython.display import Image
Image(filename='./images/3-3-1-1_beyond_reinforce_review_reinforce.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-2-1_noise_reduction_sampled_trajectories_do_not_contain_that_much_information_about_our_policy_because_of_random_noise.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-2-2_noise_reduction_easiet_option_to_reduce_noise_is_simply_sample_more_trajectories.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-2-3_noise_reduction_another_option_is_reward_normalization.jpeg')
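A minimal sketch of the reward normalization described in the slide above: collect the total reward of each sampled trajectory and standardize the batch (the epsilon guards against division by zero; the numbers are made up):

import numpy as np

returns = np.array([12.0, 30.0, 25.0, 8.0])  # hypothetical total rewards of four sampled trajectories
normalized = (returns - returns.mean()) / (returns.std() + 1e-10)
print(normalized)  # zero mean, unit scale: roughly half positive, half negative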
from IPython.display import Image
Image(filename='./images/3-3-3-1_credit_assignment_take_closer_look_at_the_total_reward_r.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-3-2_credit_assignment_past_rewards_do_not_affect_to_current_action_we_assign_credit_to_current_action.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-3-3_credit_assignment_simply_have_the_future_reward_as_the_coefficient.jpeg')
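Credit assignment replaces the total reward with the future reward (the "reward to go") from each time step; a minimal sketch for one trajectory, with made-up per-step rewards and no discounting:

rewards = [0.0, 0.0, 1.0, 0.0, -1.0]  # hypothetical per-step rewards

# future_rewards[t] = sum of rewards from step t onward
future_rewards = []
running = 0.0
for r in reversed(rewards):
    running += r
    future_rewards.append(running)
future_rewards.reverse()
print(future_rewards)  # [0.0, 0.0, 0.0, -1.0, -1.0]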
Suppose we are training an agent to play a computer game. There are only two possible actions: "move" and "do nothing."
There are three time-steps in each game, and our policy is completely determined by one parameter θ, such that the probability of "moving" is θ, and the probability of doing nothing is 1−θ.
Initially, θ = 0.5. Three games are played, with the following results:
What is the policy gradient computed from the second game, using future rewards?
Which of these statements are true regarding the third game?
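To work these out by hand, note that for this one-parameter policy the log-derivatives are

$$\nabla_\theta \log \pi_\theta(\text{move}) = \frac{1}{\theta}, \qquad \nabla_\theta \log \pi_\theta(\text{nothing}) = -\frac{1}{1-\theta},$$

so at θ = 0.5 each "move" contributes +2 and each "do nothing" contributes −2, with each term scaled by the future reward from that time step.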
from IPython.display import Image
Image(filename='./images/3-3-4-1_importance_sampling_data_recycling.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-2_importance_sampling_generate_trajectory_with_policy_pi_theta.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-3_importance_sampling_compute_policy_gradient_and_update_theta_to_theta_prime.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-4_importance_sampling_throw_away_just_generated_trajectory.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-5_importance_sampling_recycle_the_old_trajectories.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-6_importance_sampling_reuse_the_recycled_trajectories_to_compute_gradients_and_update_policy.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-7_importance_sampling_generated_using_the_policy_pi_theta_is_same_trajectory_by_new_policy_different_probability.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-8_importance_sampling_we_want_compute_the_average_of_some_quantity_say_f_of_Tau.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-9_importance_sampling_weighted_by_a_probability_of_sampling_each_trajectory.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-10_multiplying_and_dividing_by_the_same_number_P(τ;θ).jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-11_importance_sampling_rearrange_the_terms.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-12_importance_sampling_under_the_old_policy_and_extra_re-weighting_factor_in_addition_to_just_averaging.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-4-13_importance_sampling_re-weighting_factor.jpeg')
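The chain of slides above amounts to a single identity: multiply and divide by $P(\tau;\theta)$, then rearrange, and the average under the new policy becomes an average under the old policy with an extra re-weighting factor:

$$\mathbb{E}_{\tau \sim \pi_{\theta'}}\big[f(\tau)\big] = \sum_\tau P(\tau;\theta')\, f(\tau) = \sum_\tau P(\tau;\theta)\, \frac{P(\tau;\theta')}{P(\tau;\theta)}\, f(\tau) = \mathbb{E}_{\tau \sim \pi_\theta}\!\left[\frac{P(\tau;\theta')}{P(\tau;\theta)}\, f(\tau)\right].$$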
from IPython.display import Image
Image(filename='./images/3-3-5-1_ppo_surrogate_function.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-2_ppo_surrogate_function_re-weighining_pg_which_is_applied_reward_normalization_and_credit_assignment.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-3_ppo_surrogate_function_times_a_re-weighting_factor.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-4_ppo_surrogate_function_rearrange_equation.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-5_ppo_surrogate_function_re-weighting_factor_is_ust_the_product_of_all_the_policy_across_each_step.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-6_ppo_surrogate_function_rearrange_equation_again.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-7_ppo_surrogate_function_cancel_some_terms.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-8_ppo_surrogate_function_equation_simplified.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-9_ppo_surrogate_function_equation_rearranged.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-10_ppo_surrogate_function_now_we_have_the_approximate_form_of_the_gradient.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-5-11_ppo_surrogate_function_new_gradient.jpeg')
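After the cancellations in the slides, the surrogate function whose gradient reproduces the re-weighted policy gradient is (writing $R_t^{\rm future}$ for the future reward at step $t$, as in the credit-assignment section, and averaging over the sampled trajectories):

$$L_{\rm sur}(\theta', \theta) = \sum_t \frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)}\, R_t^{\rm future}, \qquad g = \nabla_{\theta'} L_{\rm sur}(\theta', \theta).$$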
from IPython.display import Image
Image(filename='./images/3-3-6-1_ppo_clipping_policy_updates.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-2_ppo_the_policy_or_reward_off.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-3_ppo_Lsur_approximates_reward_well_around_the_current_policy_but_diverges_from_actual_reward.png')
from IPython.display import Image
Image(filename='./images/3-3-6-4_ppo_clipped_surrogate_function.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-5_ppo_clipped_surrogate_function_if_reward_function_is_zero_the_gradient_zero_policy_update_will_stop.png')
from IPython.display import Image
Image(filename='./images/3-3-6-6_ppo_clipped_surrogate_function_original surrogate function.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-7_ppo_clipped_surrogate_function_apply_the_clip_function_to_force_the_ratio_to_be_within_the_interval.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-8_ppo_clipped_surrogate_function_we_only_want_to_clip_the_top_part_and_not_the_bottom_part.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-9_ppo_clipped_surrogate_function_gives_us_more_conservative_reward.png')
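Putting the clipping idea into code: a minimal sketch, assuming PyTorch, where ratio holds the probability ratio π(θ')/π(θ) at the sampled actions, and the epsilon default of 0.2 follows the value suggested in the PPO paper:

import torch

def clipped_surrogate(ratio, future_rewards, epsilon=0.2):
    # min(r * R, clip(r, 1 - eps, 1 + eps) * R), summed over time steps;
    # the clipping keeps the estimate from rewarding large policy changes.
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    return torch.min(ratio * future_rewards, clipped * future_rewards).sum()

ratio = torch.tensor([0.8, 1.5, 1.0])  # hypothetical probability ratios
R = torch.tensor([1.0, 1.0, -1.0])     # hypothetical future rewards
print(clipped_surrogate(ratio, R))     # tensor(1.)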
from IPython.display import Image
Image(filename='./images/3-3-6-10_ppo_summary.jpeg')
from IPython.display import Image
Image(filename='./images/3-3-6-11_ppo_paper.jpeg')